import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
df = pd.read_csv('marketing_data.csv')
random5_samples = df.sample(5)
random5_samples.transpose()
| 497 | 164 | 825 | 2099 | 2134 | |
|---|---|---|---|---|---|
| ID | 1501 | 944 | 6885 | 6262 | 5885 |
| Year_Birth | 1982 | 1962 | 1977 | 1962 | 1973 |
| Education | PhD | Graduation | Graduation | Master | 2n Cycle |
| Marital_Status | Married | Married | Divorced | Single | Married |
| Income | $160,803.00 | $56,181.00 | $32,952.00 | $72,217.00 | $35,688.00 |
| Kidhome | 0 | 0 | 1 | 0 | 2 |
| Teenhome | 0 | 1 | 0 | 0 | 1 |
| Dt_Customer | 8/4/12 | 1/8/13 | 12/1/13 | 4/10/13 | 8/22/12 |
| Recency | 21 | 6 | 36 | 93 | 94 |
| MntWines | 55 | 121 | 38 | 816 | 73 |
| MntFruits | 16 | 103 | 0 | 42 | 3 |
| MntMeatProducts | 1622 | 69 | 12 | 450 | 90 |
| MntFishProducts | 17 | 8 | 3 | 55 | 12 |
| MntSweetProducts | 3 | 44 | 1 | 56 | 1 |
| MntGoldProds | 4 | 48 | 1 | 154 | 32 |
| NumDealsPurchases | 15 | 1 | 1 | 1 | 7 |
| NumWebPurchases | 0 | 4 | 2 | 3 | 4 |
| NumCatalogPurchases | 28 | 2 | 0 | 10 | 1 |
| NumStorePurchases | 1 | 7 | 3 | 8 | 4 |
| NumWebVisitsMonth | 0 | 4 | 7 | 2 | 8 |
| AcceptedCmp3 | 0 | 0 | 0 | 0 | 0 |
| AcceptedCmp4 | 0 | 0 | 0 | 0 | 0 |
| AcceptedCmp5 | 0 | 0 | 0 | 0 | 0 |
| AcceptedCmp1 | 0 | 0 | 0 | 0 | 0 |
| AcceptedCmp2 | 0 | 0 | 0 | 0 | 0 |
| Response | 0 | 0 | 0 | 0 | 0 |
| Complain | 0 | 0 | 0 | 0 | 0 |
| Country | US | SP | CA | GER | CA |
random5_samples.info()
# Findings
# There are 2240 entries and 28 columns
# Most of the columns has quatitative/continuous data
# 5 Cols has categorical data type.
# Income col is type of categorical but it should be converted to continuous data for the better data visualization.
# Hot encoding/Label encoding can be used for Marital_Status, Education and Country cols
# ID col is unique customer identfier and it can be dropped.
# As per above samples Almost all customers did not accpet the offer after campaign
# Remove ID column
# df=df.iloc[:,1:] # can remove col using col index
df = df.drop(['ID'], axis=1)
<class 'pandas.core.frame.DataFrame'> Int64Index: 5 entries, 497 to 2134 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 5 non-null int64 1 Year_Birth 5 non-null int64 2 Education 5 non-null object 3 Marital_Status 5 non-null object 4 Income 5 non-null object 5 Kidhome 5 non-null int64 6 Teenhome 5 non-null int64 7 Dt_Customer 5 non-null object 8 Recency 5 non-null int64 9 MntWines 5 non-null int64 10 MntFruits 5 non-null int64 11 MntMeatProducts 5 non-null int64 12 MntFishProducts 5 non-null int64 13 MntSweetProducts 5 non-null int64 14 MntGoldProds 5 non-null int64 15 NumDealsPurchases 5 non-null int64 16 NumWebPurchases 5 non-null int64 17 NumCatalogPurchases 5 non-null int64 18 NumStorePurchases 5 non-null int64 19 NumWebVisitsMonth 5 non-null int64 20 AcceptedCmp3 5 non-null int64 21 AcceptedCmp4 5 non-null int64 22 AcceptedCmp5 5 non-null int64 23 AcceptedCmp1 5 non-null int64 24 AcceptedCmp2 5 non-null int64 25 Response 5 non-null int64 26 Complain 5 non-null int64 27 Country 5 non-null object dtypes: int64(23), object(5) memory usage: 1.3+ KB
df.describe(include="O")
| Education | Marital_Status | Income | Dt_Customer | Country | |
|---|---|---|---|---|---|
| count | 2240 | 2240 | 2216 | 2240 | 2240 |
| unique | 5 | 8 | 1974 | 663 | 8 |
| top | Graduation | Married | $7,500.00 | 8/31/12 | SP |
| freq | 1127 | 864 | 12 | 12 | 1095 |
df.describe(include=int).transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Year_Birth | 2240.0 | 1968.805804 | 11.984069 | 1893.0 | 1959.00 | 1970.0 | 1977.00 | 1996.0 |
| Kidhome | 2240.0 | 0.444196 | 0.538398 | 0.0 | 0.00 | 0.0 | 1.00 | 2.0 |
| Teenhome | 2240.0 | 0.506250 | 0.544538 | 0.0 | 0.00 | 0.0 | 1.00 | 2.0 |
| Recency | 2240.0 | 49.109375 | 28.962453 | 0.0 | 24.00 | 49.0 | 74.00 | 99.0 |
| MntWines | 2240.0 | 303.935714 | 336.597393 | 0.0 | 23.75 | 173.5 | 504.25 | 1493.0 |
| MntFruits | 2240.0 | 26.302232 | 39.773434 | 0.0 | 1.00 | 8.0 | 33.00 | 199.0 |
| MntMeatProducts | 2240.0 | 166.950000 | 225.715373 | 0.0 | 16.00 | 67.0 | 232.00 | 1725.0 |
| MntFishProducts | 2240.0 | 37.525446 | 54.628979 | 0.0 | 3.00 | 12.0 | 50.00 | 259.0 |
| MntSweetProducts | 2240.0 | 27.062946 | 41.280498 | 0.0 | 1.00 | 8.0 | 33.00 | 263.0 |
| MntGoldProds | 2240.0 | 44.021875 | 52.167439 | 0.0 | 9.00 | 24.0 | 56.00 | 362.0 |
| NumDealsPurchases | 2240.0 | 2.325000 | 1.932238 | 0.0 | 1.00 | 2.0 | 3.00 | 15.0 |
| NumWebPurchases | 2240.0 | 4.084821 | 2.778714 | 0.0 | 2.00 | 4.0 | 6.00 | 27.0 |
| NumCatalogPurchases | 2240.0 | 2.662054 | 2.923101 | 0.0 | 0.00 | 2.0 | 4.00 | 28.0 |
| NumStorePurchases | 2240.0 | 5.790179 | 3.250958 | 0.0 | 3.00 | 5.0 | 8.00 | 13.0 |
| NumWebVisitsMonth | 2240.0 | 5.316518 | 2.426645 | 0.0 | 3.00 | 6.0 | 7.00 | 20.0 |
| AcceptedCmp3 | 2240.0 | 0.072768 | 0.259813 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| AcceptedCmp4 | 2240.0 | 0.074554 | 0.262728 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| AcceptedCmp5 | 2240.0 | 0.072768 | 0.259813 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| AcceptedCmp1 | 2240.0 | 0.064286 | 0.245316 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| AcceptedCmp2 | 2240.0 | 0.013393 | 0.114976 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| Response | 2240.0 | 0.149107 | 0.356274 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| Complain | 2240.0 | 0.009375 | 0.096391 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
df.describe().median()
Year_Birth 1969.402902 Kidhome 0.491297 Teenhome 0.525394 Recency 49.054688 MntWines 320.266553 MntFruits 29.651116 MntMeatProducts 196.332686 MntFishProducts 43.762723 MntSweetProducts 30.031473 MntGoldProds 48.094657 NumDealsPurchases 2.162500 NumWebPurchases 4.042411 NumCatalogPurchases 2.792577 NumStorePurchases 5.395089 NumWebVisitsMonth 5.658259 AcceptedCmp3 0.036384 AcceptedCmp4 0.037277 AcceptedCmp5 0.036384 AcceptedCmp1 0.032143 AcceptedCmp2 0.006696 Response 0.074554 Complain 0.004687 dtype: float64
df.describe().quantile([0.25, 0.5, 0.75]).transpose()
| 0.25 | 0.50 | 0.75 | |
|---|---|---|---|
| Year_Birth | 1942.500000 | 1969.402902 | 1981.750000 |
| Kidhome | 0.000000 | 0.491297 | 1.250000 |
| Teenhome | 0.000000 | 0.525394 | 1.250000 |
| Recency | 27.721840 | 49.054688 | 80.250000 |
| MntWines | 136.062500 | 320.266553 | 751.437500 |
| MntFruits | 6.250000 | 29.651116 | 79.580075 |
| MntMeatProducts | 54.250000 | 196.332686 | 605.250000 |
| MntFishProducts | 9.750000 | 43.762723 | 105.721735 |
| MntSweetProducts | 6.250000 | 30.031473 | 96.710374 |
| MntGoldProds | 20.250000 | 48.094657 | 132.500000 |
| NumDealsPurchases | 1.699178 | 2.162500 | 6.000000 |
| NumWebPurchases | 2.584036 | 4.042411 | 11.250000 |
| NumCatalogPurchases | 1.500000 | 2.792577 | 10.000000 |
| NumStorePurchases | 3.188219 | 5.395089 | 9.250000 |
| NumWebVisitsMonth | 2.856661 | 5.658259 | 10.250000 |
| AcceptedCmp3 | 0.000000 | 0.036384 | 0.444860 |
| AcceptedCmp4 | 0.000000 | 0.037277 | 0.447046 |
| AcceptedCmp5 | 0.000000 | 0.036384 | 0.444860 |
| AcceptedCmp1 | 0.000000 | 0.032143 | 0.433987 |
| AcceptedCmp2 | 0.000000 | 0.006696 | 0.336232 |
| Response | 0.000000 | 0.074554 | 0.517205 |
| Complain | 0.000000 | 0.004687 | 0.322293 |
df.isnull().sum()
Year_Birth 0 Education 0 Marital_Status 0 Income 24 Kidhome 0 Teenhome 0 Dt_Customer 0 Recency 0 MntWines 0 MntFruits 0 MntMeatProducts 0 MntFishProducts 0 MntSweetProducts 0 MntGoldProds 0 NumDealsPurchases 0 NumWebPurchases 0 NumCatalogPurchases 0 NumStorePurchases 0 NumWebVisitsMonth 0 AcceptedCmp3 0 AcceptedCmp4 0 AcceptedCmp5 0 AcceptedCmp1 0 AcceptedCmp2 0 Response 0 Complain 0 Country 0 dtype: int64
Only "Income" col has 24 null entries out of total 2240 entries.
# Rename Income col
df.rename(columns={' Income ': 'Income'}, inplace=True)
df.columns
Index(['Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
'AcceptedCmp2', 'Response', 'Complain', 'Country'],
dtype='object')
# List of missing values in Income col
df[pd.isna(df['Income'])]
| Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | ... | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Response | Complain | Country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 134 | 1957 | PhD | Married | NaN | 2 | 1 | 11/19/12 | 4 | 230 | 42 | ... | 8 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | GER |
| 262 | 1983 | Graduation | Married | NaN | 1 | 0 | 11/15/13 | 11 | 5 | 5 | ... | 2 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | US |
| 394 | 1972 | PhD | Together | NaN | 1 | 0 | 3/2/14 | 17 | 25 | 1 | ... | 3 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS |
| 449 | 1986 | Graduation | Single | NaN | 1 | 0 | 2/20/13 | 19 | 5 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS |
| 525 | 1961 | PhD | Married | NaN | 0 | 1 | 7/11/13 | 23 | 352 | 0 | ... | 7 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | CA |
| 590 | 1973 | 2n Cycle | Married | NaN | 1 | 0 | 9/14/12 | 25 | 25 | 3 | ... | 3 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | GER |
| 899 | 1970 | Master | Together | NaN | 0 | 1 | 4/1/13 | 39 | 187 | 5 | ... | 6 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | US |
| 997 | 1957 | Graduation | Single | NaN | 1 | 1 | 5/27/14 | 45 | 7 | 0 | ... | 2 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | GER |
| 1096 | 1964 | 2n Cycle | Single | NaN | 1 | 1 | 1/12/14 | 49 | 5 | 1 | ... | 2 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS |
| 1185 | 1969 | Master | Together | NaN | 1 | 1 | 5/18/13 | 52 | 375 | 42 | ... | 4 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS |
| 1213 | 1978 | 2n Cycle | Together | NaN | 0 | 0 | 8/12/12 | 53 | 32 | 2 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | IND |
| 1312 | 1982 | Graduation | Single | NaN | 1 | 0 | 6/17/13 | 57 | 11 | 3 | ... | 3 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS |
| 1515 | 1970 | Graduation | Single | NaN | 1 | 2 | 8/23/13 | 67 | 738 | 20 | ... | 10 | 7 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | SP |
| 1558 | 1989 | Graduation | Married | NaN | 0 | 0 | 6/3/13 | 69 | 861 | 138 | ... | 12 | 3 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | SP |
| 1693 | 1943 | Master | Widow | NaN | 0 | 0 | 10/30/13 | 75 | 532 | 126 | ... | 11 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | AUS |
| 1804 | 1959 | PhD | Single | NaN | 0 | 0 | 11/5/13 | 80 | 81 | 11 | ... | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS |
| 1858 | 1981 | PhD | Single | NaN | 1 | 0 | 5/31/13 | 82 | 23 | 0 | ... | 3 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS |
| 1863 | 1971 | Graduation | Married | NaN | 1 | 1 | 3/3/13 | 82 | 71 | 1 | ... | 3 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS |
| 1880 | 1954 | Master | Together | NaN | 0 | 1 | 6/23/13 | 83 | 161 | 0 | ... | 4 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS |
| 1967 | 1973 | Master | Together | NaN | 0 | 0 | 11/23/13 | 87 | 445 | 37 | ... | 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | GER |
| 1983 | 1958 | Graduation | Together | NaN | 1 | 1 | 9/3/12 | 87 | 19 | 4 | ... | 3 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS |
| 2139 | 1955 | Graduation | Single | NaN | 0 | 1 | 10/18/13 | 95 | 264 | 0 | ... | 5 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS |
| 2165 | 1951 | Graduation | Single | NaN | 2 | 1 | 1/1/14 | 96 | 48 | 5 | ... | 4 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS |
| 2170 | 1963 | Graduation | Married | NaN | 0 | 1 | 8/11/13 | 96 | 231 | 65 | ... | 7 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | CA |
24 rows × 27 columns
# Check most common value in Income col
most_common_income = df['Income'].value_counts().idxmax()
print(most_common_income)
# Check null percentage
df['Income'].isna().sum()/df.shape[0]*100
$7,500.00
1.0714285714285714
# Rows having null income values are very less so we should drop those
df.dropna(inplace=True)
df.count()
Year_Birth 2216 Education 2216 Marital_Status 2216 Income 2216 Kidhome 2216 Teenhome 2216 Dt_Customer 2216 Recency 2216 MntWines 2216 MntFruits 2216 MntMeatProducts 2216 MntFishProducts 2216 MntSweetProducts 2216 MntGoldProds 2216 NumDealsPurchases 2216 NumWebPurchases 2216 NumCatalogPurchases 2216 NumStorePurchases 2216 NumWebVisitsMonth 2216 AcceptedCmp3 2216 AcceptedCmp4 2216 AcceptedCmp5 2216 AcceptedCmp1 2216 AcceptedCmp2 2216 Response 2216 Complain 2216 Country 2216 dtype: int64
df.head(1).T
| 0 | |
|---|---|
| Year_Birth | 1970 |
| Education | Graduation |
| Marital_Status | Divorced |
| Income | $84,835.00 |
| Kidhome | 0 |
| Teenhome | 0 |
| Dt_Customer | 6/16/14 |
| Recency | 0 |
| MntWines | 189 |
| MntFruits | 104 |
| MntMeatProducts | 379 |
| MntFishProducts | 111 |
| MntSweetProducts | 189 |
| MntGoldProds | 218 |
| NumDealsPurchases | 1 |
| NumWebPurchases | 4 |
| NumCatalogPurchases | 4 |
| NumStorePurchases | 6 |
| NumWebVisitsMonth | 1 |
| AcceptedCmp3 | 0 |
| AcceptedCmp4 | 0 |
| AcceptedCmp5 | 0 |
| AcceptedCmp1 | 0 |
| AcceptedCmp2 | 0 |
| Response | 1 |
| Complain | 0 |
| Country | SP |
# remove $ and , characters then convert it into continuous variable
df['Income'] = df['Income'].apply(lambda x: float(x.split('$')[1].replace(',', '')))
df.head()
| Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | ... | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Response | Complain | Country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1970 | Graduation | Divorced | 84835.0 | 0 | 0 | 6/16/14 | 0 | 189 | 104 | ... | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | SP |
| 1 | 1961 | Graduation | Single | 57091.0 | 0 | 0 | 6/15/14 | 0 | 464 | 5 | ... | 7 | 5 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | CA |
| 2 | 1958 | Graduation | Married | 67267.0 | 0 | 1 | 5/13/14 | 0 | 134 | 11 | ... | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | US |
| 3 | 1967 | Graduation | Together | 32474.0 | 1 | 1 | 5/11/14 | 0 | 10 | 0 | ... | 2 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS |
| 4 | 1989 | Graduation | Single | 21474.0 | 1 | 0 | 4/8/14 | 0 | 6 | 16 | ... | 2 | 7 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | SP |
5 rows × 27 columns
df['Age'] = 2022 - df['Year_Birth']
df[['Age', 'Year_Birth']].head()
| Age | Year_Birth | |
|---|---|---|
| 0 | 52 | 1970 |
| 1 | 61 | 1961 |
| 2 | 64 | 1958 |
| 3 | 55 | 1967 |
| 4 | 33 | 1989 |
df['Age'].describe()
count 2216.000000 mean 53.179603 std 11.985554 min 26.000000 25% 45.000000 50% 52.000000 75% 63.000000 max 129.000000 Name: Age, dtype: float64
df.columns
Index(['Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
'AcceptedCmp2', 'Response', 'Complain', 'Country', 'Age'],
dtype='object')
We have 6 products for customers.
df['Total_Amount'] = df['MntWines'] + df['MntFruits']+ df['MntMeatProducts']+ df['MntFishProducts']+ df['MntSweetProducts']+ df['MntGoldProds']
df[['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'Total_Amount']].head().T
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| MntWines | 189 | 464 | 134 | 10 | 6 |
| MntFruits | 104 | 5 | 11 | 0 | 16 |
| MntMeatProducts | 379 | 64 | 59 | 1 | 24 |
| MntFishProducts | 111 | 7 | 15 | 0 | 11 |
| MntSweetProducts | 189 | 0 | 2 | 0 | 0 |
| MntGoldProds | 218 | 37 | 30 | 0 | 34 |
| Total_Amount | 1190 | 577 | 251 | 11 | 91 |
products_df = df[['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']]
products_df.head()
| MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | |
|---|---|---|---|---|---|---|
| 0 | 189 | 104 | 379 | 111 | 189 | 218 |
| 1 | 464 | 5 | 64 | 7 | 0 | 37 |
| 2 | 134 | 11 | 59 | 15 | 2 | 30 |
| 3 | 10 | 0 | 1 | 0 | 0 | 0 |
| 4 | 6 | 16 | 24 | 11 | 0 | 34 |
products_df.apply(lambda x: x/df['Total_Amount']*100)
| MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | |
|---|---|---|---|---|---|---|
| 0 | 15.882353 | 8.739496 | 31.848739 | 9.327731 | 15.882353 | 18.319328 |
| 1 | 80.415945 | 0.866551 | 11.091854 | 1.213172 | 0.000000 | 6.412478 |
| 2 | 53.386454 | 4.382470 | 23.505976 | 5.976096 | 0.796813 | 11.952191 |
| 3 | 90.909091 | 0.000000 | 9.090909 | 0.000000 | 0.000000 | 0.000000 |
| 4 | 6.593407 | 17.582418 | 26.373626 | 12.087912 | 0.000000 | 37.362637 |
| ... | ... | ... | ... | ... | ... | ... |
| 2235 | 53.991292 | 2.612482 | 18.287373 | 6.821480 | 6.966618 | 11.320755 |
| 2236 | 9.090909 | 18.181818 | 23.636364 | 5.454545 | 14.545455 | 29.090909 |
| 2237 | 59.870550 | 0.647249 | 28.478964 | 4.854369 | 1.618123 | 4.530744 |
| 2238 | 19.305857 | 2.747650 | 50.686913 | 10.773680 | 11.930586 | 4.555315 |
| 2239 | 15.677180 | 2.226345 | 51.298701 | 17.439703 | 0.000000 | 13.358071 |
2216 rows × 6 columns
List of different purchase channels
df['Total_Purchase'] = df['NumDealsPurchases'] + df['NumWebPurchases'] + df['NumCatalogPurchases'] + df['NumStorePurchases']
purchase_channels_df = df[['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'Total_Purchase']]
purchase_channels_df.head()
| NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | Total_Purchase | |
|---|---|---|---|---|---|
| 0 | 1 | 4 | 4 | 6 | 15 |
| 1 | 1 | 7 | 3 | 7 | 18 |
| 2 | 1 | 3 | 2 | 5 | 11 |
| 3 | 1 | 1 | 0 | 2 | 4 |
| 4 | 2 | 3 | 1 | 2 | 8 |
purchase_channels_df.iloc[:, :4].describe()
| NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | |
|---|---|---|---|---|
| count | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 |
| mean | 2.323556 | 4.085289 | 2.671029 | 5.800993 |
| std | 1.923716 | 2.740951 | 2.926734 | 3.250785 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 1.000000 | 2.000000 | 0.000000 | 3.000000 |
| 50% | 2.000000 | 4.000000 | 2.000000 | 5.000000 |
| 75% | 3.000000 | 6.000000 | 4.000000 | 8.000000 |
| max | 15.000000 | 27.000000 | 28.000000 | 13.000000 |
df[['NumStorePurchases', 'NumCatalogPurchases', 'NumWebPurchases', 'NumDealsPurchases']].apply(lambda x: x/df['Total_Purchase']*100)
| NumStorePurchases | NumCatalogPurchases | NumWebPurchases | NumDealsPurchases | |
|---|---|---|---|---|
| 0 | 40.000000 | 26.666667 | 26.666667 | 6.666667 |
| 1 | 38.888889 | 16.666667 | 38.888889 | 5.555556 |
| 2 | 45.454545 | 18.181818 | 27.272727 | 9.090909 |
| 3 | 50.000000 | 0.000000 | 25.000000 | 25.000000 |
| 4 | 25.000000 | 12.500000 | 37.500000 | 25.000000 |
| ... | ... | ... | ... | ... |
| 2235 | 55.000000 | 10.000000 | 25.000000 | 10.000000 |
| 2236 | 60.000000 | 0.000000 | 20.000000 | 20.000000 |
| 2237 | 35.714286 | 7.142857 | 42.857143 | 14.285714 |
| 2238 | 50.000000 | 20.000000 | 25.000000 | 5.000000 |
| 2239 | 22.222222 | 27.777778 | 44.444444 | 5.555556 |
2216 rows × 4 columns
import warnings
warnings.filterwarnings('ignore')
campaigns_df = df[['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response']]
campaigns_df.head()
| AcceptedCmp1 | AcceptedCmp2 | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | Response | |
|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | 0 | 1 | 0 | 0 | 0 | 1 |
| 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 1 | 0 | 0 | 1 |
sns.countplot(campaigns_df['AcceptedCmp1'])
<AxesSubplot:xlabel='AcceptedCmp1', ylabel='count'>
sns.countplot(campaigns_df['AcceptedCmp2'])
<AxesSubplot:xlabel='AcceptedCmp2', ylabel='count'>
sns.countplot(campaigns_df['AcceptedCmp3'])
<AxesSubplot:xlabel='AcceptedCmp3', ylabel='count'>
sns.countplot(campaigns_df['AcceptedCmp4'])
<AxesSubplot:xlabel='AcceptedCmp4', ylabel='count'>
sns.countplot(campaigns_df['AcceptedCmp5'])
<AxesSubplot:xlabel='AcceptedCmp5', ylabel='count'>
# Last campaign
sns.countplot(df['Response'])
<AxesSubplot:xlabel='Response', ylabel='count'>
campaigns_df = pd.melt(campaigns_df)
sns.set(rc={'figure.figsize':(10.7,6.27)})
# ax = sns.countplot(data=campaigns_df, x="value") # shows value count for all campaigns
# ax = sns.countplot(data=campaigns_df, x="value", hue="variable") # another way
ax = sns.countplot(data=campaigns_df, x="variable", hue="value")
for container in ax.containers:
ax.bar_label(container)
plt.show()
sns.reset_orig()
Findings
0 shows that customer accepted the offer.1 shows that customer did not accept the offer.0 in each campaign is very large that shows each campaign is great failure.Last campaign ('Response') is most successful.products_df_with_revenue = df[['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'Total_Amount']]
products_df_with_revenue.head()
| MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | Total_Amount | |
|---|---|---|---|---|---|---|---|
| 0 | 189 | 104 | 379 | 111 | 189 | 218 | 1190 |
| 1 | 464 | 5 | 64 | 7 | 0 | 37 | 577 |
| 2 | 134 | 11 | 59 | 15 | 2 | 30 | 251 |
| 3 | 10 | 0 | 1 | 0 | 0 | 0 | 11 |
| 4 | 6 | 16 | 24 | 11 | 0 | 34 | 91 |
products_df_with_revenue.sum()
MntWines 676083 MntFruits 58405 MntMeatProducts 370063 MntFishProducts 83405 MntSweetProducts 59896 MntGoldProds 97427 Total_Amount 1345279 dtype: int64
product_revenue_percentage = products_df.sum() / df['Total_Amount'].sum()*100
product_revenue_percentage
MntWines 50.255969 MntFruits 4.341479 MntMeatProducts 27.508272 MntFishProducts 6.199829 MntSweetProducts 4.452311 MntGoldProds 7.242141 dtype: float64
Findings
MntWines product with 676083 (50.26% of total).MntFruits product with 58405 (4.34% of total).# product_revenue_percentage.plot(kind='pie')
plt.pie(product_revenue_percentage,labels=product_revenue_percentage.index, autopct='%1.2f%%')
plt.title('Product wise generated Revenue')
plt.show()
last_cmp_acc_cust_df = df[df['Response'] == 1]
last_cmp_acc_cust_df.head().T
| 0 | 1 | 4 | 5 | 6 | |
|---|---|---|---|---|---|
| Year_Birth | 1970 | 1961 | 1989 | 1958 | 1954 |
| Education | Graduation | Graduation | Graduation | PhD | 2n Cycle |
| Marital_Status | Divorced | Single | Single | Single | Married |
| Income | 84835.0 | 57091.0 | 21474.0 | 71691.0 | 63564.0 |
| Kidhome | 0 | 0 | 1 | 0 | 0 |
| Teenhome | 0 | 0 | 0 | 0 | 0 |
| Dt_Customer | 6/16/14 | 6/15/14 | 4/8/14 | 3/17/14 | 1/29/14 |
| Recency | 0 | 0 | 0 | 0 | 0 |
| MntWines | 189 | 464 | 6 | 336 | 769 |
| MntFruits | 104 | 5 | 16 | 130 | 80 |
| MntMeatProducts | 379 | 64 | 24 | 411 | 252 |
| MntFishProducts | 111 | 7 | 11 | 240 | 15 |
| MntSweetProducts | 189 | 0 | 0 | 32 | 34 |
| MntGoldProds | 218 | 37 | 34 | 43 | 65 |
| NumDealsPurchases | 1 | 1 | 2 | 1 | 1 |
| NumWebPurchases | 4 | 7 | 3 | 4 | 10 |
| NumCatalogPurchases | 4 | 3 | 1 | 7 | 10 |
| NumStorePurchases | 6 | 7 | 2 | 5 | 7 |
| NumWebVisitsMonth | 1 | 5 | 7 | 2 | 6 |
| AcceptedCmp3 | 0 | 0 | 1 | 0 | 1 |
| AcceptedCmp4 | 0 | 0 | 0 | 0 | 0 |
| AcceptedCmp5 | 0 | 0 | 0 | 0 | 0 |
| AcceptedCmp1 | 0 | 0 | 0 | 0 | 0 |
| AcceptedCmp2 | 0 | 1 | 0 | 0 | 0 |
| Response | 1 | 1 | 1 | 1 | 1 |
| Complain | 0 | 0 | 0 | 0 | 0 |
| Country | SP | CA | SP | SP | GER |
| Age | 52 | 61 | 33 | 64 | 68 |
| Total_Amount | 1190 | 577 | 91 | 1192 | 1215 |
| Total_Purchase | 15 | 18 | 8 | 17 | 28 |
sns.displot(last_cmp_acc_cust_df['Age'], kde=True)
plt.show()
sns.pairplot(df[['Age', 'Response']], diag_kind='kde')
plt.show()
age_response_corr = df[['Age', 'Response']].corr()
age_response_corr
| Age | Response | |
|---|---|---|
| Age | 1.000000 | -0.023692 |
| Response | -0.023692 | 1.000000 |
sns.heatmap(age_response_corr, annot=True)
plt.show()
Findings
-0.02 that shows week correlation found between Age and Response (Last Campaign)ax = sns.countplot(data=last_cmp_acc_cust_df, x='Country')
for container in ax.containers:
ax.bar_label(container)
plt.show()
Approach:
Result: Country SP has 176 customers who accepted the last campaign offer.
df['Dependents'] = df['Kidhome']+df['Teenhome']
df[['Total_Amount', 'Dependents']].sort_values(by='Total_Amount', ascending=False).head(20).T
| 672 | 671 | 1404 | 1026 | 1864 | 607 | 376 | 1811 | 1409 | 1408 | 1595 | 2055 | 696 | 1656 | 1293 | 1054 | 807 | 927 | 722 | 724 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Total_Amount | 2525 | 2525 | 2524 | 2486 | 2440 | 2352 | 2349 | 2346 | 2302 | 2302 | 2283 | 2279 | 2257 | 2252 | 2231 | 2217 | 2211 | 2209 | 2194 | 2157 |
| Dependents | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
df[['Total_Amount', 'Dependents']].sort_values(by='Total_Amount', ascending=True).head(20).T
| 1291 | 456 | 1826 | 32 | 1472 | 1925 | 1466 | 233 | 663 | 1420 | 1805 | 1194 | 2156 | 1689 | 1365 | 592 | 1866 | 550 | 3 | 871 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Total_Amount | 5 | 6 | 6 | 8 | 8 | 8 | 8 | 9 | 9 | 10 | 10 | 10 | 10 | 10 | 11 | 11 | 11 | 11 | 11 | 12 |
| Dependents | 1 | 0 | 0 | 3 | 3 | 0 | 0 | 2 | 2 | 1 | 2 | 1 | 3 | 1 | 2 | 2 | 1 | 2 | 2 | 1 |
gb_ta_deps = df[['Total_Amount', 'Dependents']].groupby(by='Dependents')
gb_ta_deps.first()
| Total_Amount | |
|---|---|
| Dependents | |
| 0 | 1190 |
| 1 | 251 |
| 2 | 11 |
| 3 | 13 |
ta_depts_corr = df[['Total_Amount', 'Dependents']].corr()
ta_depts_corr
| Total_Amount | Dependents | |
|---|---|---|
| Total_Amount | 1.000000 | -0.500244 |
| Dependents | -0.500244 | 1.000000 |
sns.heatmap(ta_depts_corr, annot=True, cmap="YlGnBu")
<AxesSubplot:>
Inferences:
Total_Amount and Dependents using heatmap.Total_Amount and Dependents.total amount spent (Total_Amount) decreases.df[['Total_Purchase', 'Dependents']].sort_values(by='Total_Purchase', ascending=False).head(20).T
| 497 | 961 | 1718 | 2204 | 41 | 1396 | 2195 | 2196 | 661 | 1262 | 1997 | 1193 | 1059 | 471 | 1686 | 884 | 850 | 1438 | 435 | 1439 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Total_Purchase | 44 | 43 | 39 | 37 | 35 | 34 | 34 | 34 | 34 | 33 | 33 | 33 | 33 | 32 | 32 | 32 | 32 | 32 | 32 | 32 |
| Dependents | 0 | 1 | 2 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 1 | 1 | 2 | 2 | 1 | 1 | 1 | 1 | 1 |
df[['Total_Purchase', 'Dependents']].sort_values(by='Total_Purchase', ascending=True).head(20).T
| 456 | 1291 | 1826 | 1925 | 1721 | 731 | 299 | 60 | 1533 | 853 | 1951 | 481 | 1468 | 1739 | 824 | 871 | 114 | 1127 | 1484 | 808 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Total_Purchase | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 |
| Dependents | 0 | 1 | 0 | 0 | 1 | 2 | 1 | 0 | 2 | 1 | 2 | 2 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
gb_tp_deps = df[['Total_Purchase', 'Dependents']].groupby(by='Dependents')
gb_tp_deps.first()
| Total_Purchase | |
|---|---|
| Dependents | |
| 0 | 15 |
| 1 | 11 |
| 2 | 4 |
| 3 | 4 |
tp_depts_corr = df[['Total_Purchase', 'Dependents']].corr()
tp_depts_corr
| Total_Purchase | Dependents | |
|---|---|---|
| Total_Purchase | 1.000000 | -0.250006 |
| Dependents | -0.250006 | 1.000000 |
sns.heatmap(tp_depts_corr, annot=True)
<AxesSubplot:>
Inferences
Dependents increases, Total_Purchase decreases and same for vice-versa.df_copy = df.copy()
df_copy.drop(axis=1, inplace=True, columns=['Year_Birth', 'Kidhome', 'Teenhome', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntSweetProducts', 'MntFishProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases'])
df_copy.head()
| Education | Marital_Status | Income | Dt_Customer | Recency | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Response | Complain | Country | Age | Total_Amount | Total_Purchase | Dependents | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Graduation | Divorced | 84835.0 | 6/16/14 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | SP | 52 | 1190 | 15 | 0 |
| 1 | Graduation | Single | 57091.0 | 6/15/14 | 0 | 5 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | CA | 61 | 577 | 18 | 0 |
| 2 | Graduation | Married | 67267.0 | 5/13/14 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | US | 64 | 251 | 11 | 1 |
| 3 | Graduation | Together | 32474.0 | 5/11/14 | 0 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS | 55 | 11 | 4 | 2 |
| 4 | Graduation | Single | 21474.0 | 4/8/14 | 0 | 7 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | SP | 33 | 91 | 8 | 1 |
df_copy_corr = df_copy.corr()
df_copy_corr
| Income | Recency | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Response | Complain | Age | Total_Amount | Total_Purchase | Dependents | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Income | 1.000000 | -0.003970 | -0.553088 | -0.016174 | 0.184400 | 0.335943 | 0.276820 | 0.087545 | 0.133047 | -0.027225 | 0.161791 | 0.667576 | 0.566873 | -0.293352 |
| Recency | -0.003970 | 1.000000 | -0.018564 | -0.032257 | 0.017566 | -0.000482 | -0.021061 | -0.001400 | -0.199766 | 0.013637 | 0.016295 | 0.020066 | 0.007519 | 0.018290 |
| NumWebVisitsMonth | -0.553088 | -0.018564 | 1.000000 | 0.061307 | -0.028666 | -0.277883 | -0.194773 | -0.007362 | -0.002209 | 0.019785 | -0.123904 | -0.499082 | -0.313926 | 0.416076 |
| AcceptedCmp3 | -0.016174 | -0.032257 | 0.061307 | 1.000000 | -0.079659 | 0.080248 | 0.095683 | 0.071702 | 0.254005 | 0.008124 | -0.061784 | 0.053041 | 0.020155 | -0.019376 |
| AcceptedCmp4 | 0.184400 | 0.017566 | -0.028666 | -0.079659 | 1.000000 | 0.311314 | 0.242782 | 0.295050 | 0.180205 | -0.027652 | 0.066109 | 0.248805 | 0.191175 | -0.088254 |
| AcceptedCmp5 | 0.335943 | -0.000482 | -0.277883 | 0.080248 | 0.311314 | 1.000000 | 0.407878 | 0.222121 | 0.323374 | -0.009576 | -0.010575 | 0.470278 | 0.217521 | -0.285761 |
| AcceptedCmp1 | 0.276820 | -0.021061 | -0.194773 | 0.095683 | 0.242782 | 0.407878 | 1.000000 | 0.176637 | 0.297345 | -0.025594 | 0.009611 | 0.380825 | 0.218626 | -0.230068 |
| AcceptedCmp2 | 0.087545 | -0.001400 | -0.007362 | 0.071702 | 0.295050 | 0.222121 | 0.176637 | 1.000000 | 0.169294 | -0.011459 | 0.006717 | 0.136161 | 0.077177 | -0.069955 |
| Response | 0.133047 | -0.199766 | -0.002209 | 0.254005 | 0.180205 | 0.323374 | 0.297345 | 0.169294 | 1.000000 | -0.002029 | -0.023692 | 0.264127 | 0.154237 | -0.167648 |
| Complain | -0.027225 | 0.013637 | 0.019785 | 0.008124 | -0.027652 | -0.009576 | -0.025594 | -0.011459 | -0.002029 | 1.000000 | 0.030407 | -0.037428 | -0.020952 | 0.031774 |
| Age | 0.161791 | 0.016295 | -0.123904 | -0.061784 | 0.066109 | -0.010575 | 0.009611 | 0.006717 | -0.023692 | 0.030407 | 1.000000 | 0.113487 | 0.170055 | 0.087398 |
| Total_Amount | 0.667576 | 0.020066 | -0.499082 | 0.053041 | 0.248805 | 0.470278 | 0.380825 | 0.136161 | 0.264127 | -0.037428 | 0.113487 | 1.000000 | 0.756403 | -0.500244 |
| Total_Purchase | 0.566873 | 0.007519 | -0.313926 | 0.020155 | 0.191175 | 0.217521 | 0.218626 | 0.077177 | 0.154237 | -0.020952 | 0.170055 | 0.756403 | 1.000000 | -0.250006 |
| Dependents | -0.293352 | 0.018290 | 0.416076 | -0.019376 | -0.088254 | -0.285761 | -0.230068 | -0.069955 | -0.167648 | 0.031774 | 0.087398 | -0.500244 | -0.250006 | 1.000000 |
sns.set(rc={'figure.figsize':(16.7,7.27)})
sns.heatmap(df_copy_corr, annot=True, annot_kws={"size": 8}, fmt=".2")
sns.reset_orig()
Inferences Postive correlation found in between
Income and AcceptedCmp1 is 0.28.Income and AcceptedCmp5 is 0.34.Income and Total_Amount is 0.67. StrongIncome and Total_Purchase is 0.57. StrongNumWebVisitsMonth and Dependents is 0.42.AcceptedCmp3 and Response is 0.25.AcceptedCmp4 and AcceptedCmp5 is 0.31.AcceptedCmp4 and AcceptedCmp1 is 0.24.AcceptedCmp4 and AcceptedCmp2 is 0.3.AcceptedCmp5 and Total_Amount is 0.47.Total_Purchase and Total_Amount is 0.76. StrongNegative correlation found in between
NumWebVisitsMonth and Income is -0.55. StrongNumWebVisitsMonth and Total_Amount is -0.5. StrongDependents and Total_Amount is -0.5. StrongNote:
complained_cust_df = df[df['Complain'] == 1]
complained_cust_df.drop(axis=1, columns=['Complain'], inplace=True)
df.groupby('Education').first()
| Year_Birth | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | MntMeatProducts | ... | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Response | Complain | Country | Age | Total_Amount | Total_Purchase | Dependents | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Education | |||||||||||||||||||||
| 2n Cycle | 1954 | Married | 63564.0 | 0 | 0 | 1/29/14 | 0 | 769 | 80 | 252 | ... | 0 | 0 | 0 | 1 | 0 | GER | 68 | 1215 | 28 | 0 |
| Basic | 1974 | Married | 18393.0 | 1 | 0 | 3/29/14 | 2 | 7 | 10 | 13 | ... | 0 | 0 | 0 | 0 | 0 | SP | 48 | 50 | 8 | 1 |
| Graduation | 1970 | Divorced | 84835.0 | 0 | 0 | 6/16/14 | 0 | 189 | 104 | 379 | ... | 0 | 0 | 0 | 1 | 0 | SP | 52 | 1190 | 15 | 0 |
| Master | 1979 | Together | 62499.0 | 1 | 0 | 12/9/13 | 0 | 140 | 4 | 61 | ... | 0 | 0 | 0 | 0 | 0 | SP | 43 | 222 | 12 | 1 |
| PhD | 1958 | Single | 71691.0 | 0 | 0 | 3/17/14 | 0 | 336 | 130 | 411 | ... | 0 | 0 | 0 | 1 | 0 | SP | 64 | 1192 | 17 | 0 |
5 rows × 30 columns
gb_education = complained_cust_df.groupby('Education')
gb_education.first()
| Year_Birth | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | MntMeatProducts | ... | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Response | Country | Age | Total_Amount | Total_Purchase | Dependents | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Education | |||||||||||||||||||||
| 2n Cycle | 1988 | Married | 15716.0 | 1 | 0 | 11/12/12 | 8 | 16 | 5 | 30 | ... | 0 | 0 | 0 | 0 | 0 | SA | 34 | 92 | 10 | 1 |
| Graduation | 1959 | Together | 57957.0 | 0 | 1 | 8/19/13 | 24 | 290 | 59 | 177 | ... | 0 | 0 | 0 | 0 | 0 | CA | 63 | 637 | 25 | 1 |
| Master | 1983 | Single | 31788.0 | 1 | 0 | 3/20/14 | 15 | 16 | 7 | 23 | ... | 0 | 0 | 0 | 0 | 0 | SA | 39 | 60 | 10 | 1 |
| PhD | 1984 | Single | 39684.0 | 1 | 0 | 10/12/12 | 41 | 19 | 0 | 35 | ... | 0 | 0 | 0 | 0 | 1 | SP | 38 | 81 | 7 | 1 |
4 rows × 29 columns
ax = sns.countplot(data=complained_cust_df, x='Education')
for container in ax.containers:
ax.bar_label(container)
plt.show()
Findings
Basic never complained.Graduation education background complained most.all_products = pd.concat([products_df, df[['Total_Amount', 'Response', 'Education', 'Marital_Status']]], axis=1)
all_products.head()
| MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | Total_Amount | Response | Education | Marital_Status | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 189 | 104 | 379 | 111 | 189 | 218 | 1190 | 1 | Graduation | Divorced |
| 1 | 464 | 5 | 64 | 7 | 0 | 37 | 577 | 1 | Graduation | Single |
| 2 | 134 | 11 | 59 | 15 | 2 | 30 | 251 | 0 | Graduation | Married |
| 3 | 10 | 0 | 1 | 0 | 0 | 0 | 11 | 0 | Graduation | Together |
| 4 | 6 | 16 | 24 | 11 | 0 | 34 | 91 | 1 | Graduation | Single |
sns.pairplot(data=all_products, hue='Response')
<seaborn.axisgrid.PairGrid at 0x7f8f05f8a4c0>
Findings in terms of response.
MntWines and Total_Amount has partial positive linear relatioship.MntMeatProducts and Total_Amount has partial positive linear relatioship.all_products.drop(axis=1, columns=['Response'], inplace=True)
sns.pairplot(data=all_products, hue='Education')
<seaborn.axisgrid.PairGrid at 0x7f8eee7b1910>
Findings in terms of Education.
MntWines and Total_Amount has partial positive linear relatioship.MntMeatProducts and Total_Amount has partial positive linear relatioship.Graduation and PhD contributed most.sns.pairplot(data=all_products, hue='Marital_Status')
<seaborn.axisgrid.PairGrid at 0x7f8edf169cd0>
Findings in terms of Marital Status.
MntWines and Total_Amount has partial positive linear relatioship.MntMeatProducts and Total_Amount has partial positive linear relatioship.